{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "fc2a1fce-bd6c-4a5b-b1ba-b2c479e3141d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e12dc616-9110-4009-9d3c-8e0670c08374",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'1.9.1+cu111'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "torch.__version__"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d7ead511-f089-4eeb-ab30-5254bb308874",
   "metadata": {},
   "source": [
    "# Tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5adc3f3d-3212-47c5-a18a-e559e5d2bdd9",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "deb8f5ba-504a-4e8d-b5af-7f5b3cdf0de6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3911f366a93c4805afd7f9e8b70b4d62",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "89d5e432616e405b82a030df747e6c65",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c16be146d92a4e62b8160372a500cc91",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading vocab.txt:   0%|          | 0.00/107k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fbf5a4ce89b149df8b9e59ff71e5ab63",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading tokenizer.json:   0%|          | 0.00/263k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-chinese\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "04be913c-2832-4e25-bf7d-7c8d48a774bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode a single sentence with the tokenizer loaded through AutoTokenizer.\n",
    "result = tokenizer(text=\"并广泛动员社会各界的力量\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "f8e7c2f2-fe10-4e89-a837-0971705c8de1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': [101, 2400, 2408, 3793, 1220, 1447, 4852, 833, 1392, 4518, 4638, 1213, 7030, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "dc637cad-f2dd-484a-b62a-f216e61bd3d7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[100, 102, 0, 101, 103]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.all_special_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "e17ab3f6-6ccb-4264-9fc1-9f8adf923416",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.all_special_tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "6972e9aa-c462-400f-aa13-9262a48b0dc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import BertTokenizerFast"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "846c68b0-f8db-42d3-8afa-e9300d29fa0b",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer1 = BertTokenizerFast.from_pretrained(\"bert-base-chinese\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "60c5bf7b-b8ee-4554-95fd-7e03d83e3cd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode the same sentence again, this time with the explicitly constructed BertTokenizerFast.\n",
    "result = tokenizer1(text=\"并广泛动员社会各界的力量\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "19f9579e-b17e-4240-9633-d6c7328d9f75",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': [101, 2400, 2408, 3793, 1220, 1447, 4852, 833, 1392, 4518, 4638, 1213, 7030, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "1dd6a394-0d10-4925-a263-894b2b8cd6a2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': [101, 5018, 671, 702, 1368, 2094, 102, 5018, 753, 702, 1368, 2094, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Encode a sentence pair: token_type_ids are 0 for the first segment and 1 for the second.\n",
    "result = tokenizer1(text=\"第一个句子\", text_pair=\"第二个句子\")\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "3bde9373-df3f-466b-a933-93bde8fa437b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'[CLS] 第 一 个 句 子 [SEP] 第 二 个 句 子 [SEP]'"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Decode the ids produced by the previous cell rather than a pasted copy of them,\n",
    "# so the round trip stays correct if the input sentences are changed.\n",
    "tokenizer1.decode(result[\"input_ids\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "f03797d7-b136-46e8-a9cb-ab7ff931aa5d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'input_ids': [[101, 5018, 671, 1368, 102, 0, 0], [101, 5018, 753, 702, 1368, 2094, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1]]}\n"
     ]
    }
   ],
   "source": [
    "# Batch-encode two sentences of different lengths; the shorter one is padded to the longer one's length.\n",
    "result = tokenizer1(text=[\"第一句\", \"第二个句子\"], padding=\"longest\")\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6282fca8-7059-4271-ad68-10b9ea23c804",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {
     "02a61ecff88c4b308922856ec40ed914": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "0bc8cb36d2104c249fa2d3f4f6b122f3": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "181fdb8c5c0b4a2bacf012807c84f6d7": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "1a252381c6ad4d8ba1b1f40dfd210168": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "1ceaaadee84648c8bdeae78a5988800c": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "1e2ac9b5111f4daa930bfc546d083664": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "29c92c388618434981a83bb1d97ff5f8": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "2e097f7837274bada987b968bd145384": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "FloatProgressModel",
      "state": {
       "bar_style": "success",
       "layout": "IPY_MODEL_0bc8cb36d2104c249fa2d3f4f6b122f3",
       "max": 624,
       "style": "IPY_MODEL_1a252381c6ad4d8ba1b1f40dfd210168",
       "value": 624
      }
     },
     "3911f366a93c4805afd7f9e8b70b4d62": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HBoxModel",
      "state": {
       "children": [
        "IPY_MODEL_ee8ed113519f4f47abfe0538191cf3f2",
        "IPY_MODEL_df50d935ed7a4393ac51df640b5bea0a",
        "IPY_MODEL_5d5f4e7b6f314288b0924620cfc99a6b"
       ],
       "layout": "IPY_MODEL_5caeeb5b954741c2adc517445132e782"
      }
     },
     "39863ca2a0624bce8407d97abf492eb5": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "3ec76e1db6c1400d88c410a3b1bdfb8e": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_c1948c8f8a3c4cc1a91e99e68236275b",
       "style": "IPY_MODEL_4ab67bc7bb1f4c7d89a68b270acdadec",
       "value": " 624/624 [00:00&lt;00:00, 36.6kB/s]"
      }
     },
     "418cf10c219f4618bdb6a2d2aea33309": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "4ab67bc7bb1f4c7d89a68b270acdadec": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "4c2eff1e74a943e990945ce85a2a255c": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "FloatProgressModel",
      "state": {
       "bar_style": "success",
       "layout": "IPY_MODEL_02a61ecff88c4b308922856ec40ed914",
       "max": 109540,
       "style": "IPY_MODEL_4cacab69b7f14f36bf8be8fe0dcdbb7e",
       "value": 109540
      }
     },
     "4cacab69b7f14f36bf8be8fe0dcdbb7e": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "507502839dd940cc8f30708caa327672": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_1e2ac9b5111f4daa930bfc546d083664",
       "style": "IPY_MODEL_5dd8464f5e404408a810672064479303",
       "value": "Downloading vocab.txt: 100%"
      }
     },
     "52a047d6632f491cb56e54105dcf6d12": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "5869f5326acf4e058892e522d2321428": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_7e8b88790e0d4086a63a8b959606390b",
       "style": "IPY_MODEL_e993cef78fe94a17a481a518781235c5",
       "value": " 263k/263k [00:00&lt;00:00, 362kB/s]"
      }
     },
     "5b8cf802451347edac2e3472d83be5f1": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "5caeeb5b954741c2adc517445132e782": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "5d5f4e7b6f314288b0924620cfc99a6b": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_f89c4be311dc45f3b68cd8e96e8e5b59",
       "style": "IPY_MODEL_ec56cd1cdbf840bda5a405c24c3563a6",
       "value": " 29.0/29.0 [00:00&lt;00:00, 2.17kB/s]"
      }
     },
     "5dd8464f5e404408a810672064479303": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "63c21f29e69b45339bec350e59c6fcb2": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_418cf10c219f4618bdb6a2d2aea33309",
       "style": "IPY_MODEL_6ff8dfc4a9dc4de68fad85f1bc608741",
       "value": " 107k/107k [00:00&lt;00:00, 254kB/s]"
      }
     },
     "6ff8dfc4a9dc4de68fad85f1bc608741": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "7e8b88790e0d4086a63a8b959606390b": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "82f69ab676b44652a1db1c76b9706498": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "88da7f7ce1324993a910f05e184fb17c": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "89d5e432616e405b82a030df747e6c65": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HBoxModel",
      "state": {
       "children": [
        "IPY_MODEL_b620919f6e6340b482e5fe438a0a0a7a",
        "IPY_MODEL_2e097f7837274bada987b968bd145384",
        "IPY_MODEL_3ec76e1db6c1400d88c410a3b1bdfb8e"
       ],
       "layout": "IPY_MODEL_82f69ab676b44652a1db1c76b9706498"
      }
     },
     "941323b5d6554e8d8b764eac7273c2c4": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "9b92b77efbdf4ef39fccda7963da834e": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "9cc883087ff84ea6a44dbb0b75fde8ca": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "b2fdff22e3974c00857c887af38db422": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "b620919f6e6340b482e5fe438a0a0a7a": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_9b92b77efbdf4ef39fccda7963da834e",
       "style": "IPY_MODEL_181fdb8c5c0b4a2bacf012807c84f6d7",
       "value": "Downloading config.json: 100%"
      }
     },
     "c16be146d92a4e62b8160372a500cc91": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HBoxModel",
      "state": {
       "children": [
        "IPY_MODEL_507502839dd940cc8f30708caa327672",
        "IPY_MODEL_4c2eff1e74a943e990945ce85a2a255c",
        "IPY_MODEL_63c21f29e69b45339bec350e59c6fcb2"
       ],
       "layout": "IPY_MODEL_1ceaaadee84648c8bdeae78a5988800c"
      }
     },
     "c1948c8f8a3c4cc1a91e99e68236275b": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "dcb29d9e8aaa4c88b005f72e1db1a184": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "FloatProgressModel",
      "state": {
       "bar_style": "success",
       "layout": "IPY_MODEL_88da7f7ce1324993a910f05e184fb17c",
       "max": 268943,
       "style": "IPY_MODEL_39863ca2a0624bce8407d97abf492eb5",
       "value": 268943
      }
     },
     "de84f949f701477a890d01d7d8040dee": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_941323b5d6554e8d8b764eac7273c2c4",
       "style": "IPY_MODEL_9cc883087ff84ea6a44dbb0b75fde8ca",
       "value": "Downloading tokenizer.json: 100%"
      }
     },
     "df50d935ed7a4393ac51df640b5bea0a": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "FloatProgressModel",
      "state": {
       "bar_style": "success",
       "layout": "IPY_MODEL_5b8cf802451347edac2e3472d83be5f1",
       "max": 29,
       "style": "IPY_MODEL_b2fdff22e3974c00857c887af38db422",
       "value": 29
      }
     },
     "e993cef78fe94a17a481a518781235c5": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "ec56cd1cdbf840bda5a405c24c3563a6": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "ee8ed113519f4f47abfe0538191cf3f2": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_fb731585b0ae4280819d8ba1b3fc927f",
       "style": "IPY_MODEL_52a047d6632f491cb56e54105dcf6d12",
       "value": "Downloading tokenizer_config.json: 100%"
      }
     },
     "f89c4be311dc45f3b68cd8e96e8e5b59": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "fb731585b0ae4280819d8ba1b3fc927f": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "fbf5a4ce89b149df8b9e59ff71e5ab63": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HBoxModel",
      "state": {
       "children": [
        "IPY_MODEL_de84f949f701477a890d01d7d8040dee",
        "IPY_MODEL_dcb29d9e8aaa4c88b005f72e1db1a184",
        "IPY_MODEL_5869f5326acf4e058892e522d2321428"
       ],
       "layout": "IPY_MODEL_29c92c388618434981a83bb1d97ff5f8"
      }
     }
    },
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
